In [2]:
import pandas as pd
import numpy as np

from numpy import exp, sqrt, dot
from scipy.spatial.distance import cdist

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
from sklearn.model_selection import KFold
from sklearn.svm import SVR

from hyperopt import STATUS_OK, hp, fmin, tpe, Trials, space_eval

Import data


In [3]:
def load_data():
    full_data = pd.read_csv("Data/X.csv")
    train_y = pd.read_csv("Data/y_train.csv")
    # Rename columns to something more interpretable
    columns = (["reflectance_" + str(i) for i in range(7)]
               + ["solar_" + str(i) for i in range(5)] + ["id"])
    full_data.columns = columns
    
    # Move ID column to the beginning
    id_column = full_data["id"]
    full_data.drop(labels=["id"], axis=1, inplace = True)
    full_data.insert(0, "id", id_column)
    
    # Add the target value column to the training part
    # in full_data
    split = 98000
    y_id_dict = train_y.set_index("id")["y"].to_dict()
    full_data.loc[:(split-1), "y"] = full_data.loc[:(split-1), "id"].map(y_id_dict)
    
    # Split into training and testing data
    train, test = full_data[:split], full_data[split:]
    return (train, test)

train, test = load_data()

Set parameters


In [4]:
random_seed = 8
# Set folds for out-of-fold prediction
n_folds = 5

Preprocessing


In [5]:
cols_excl = ["id", "y"]
cols_orig = [c for c in train.columns if c not in cols_excl]

# Standardise data can make training faster and reduce
# the chances of getting stuck in local optima
train[cols_orig] = scale(train[cols_orig])
test[cols_orig] = scale(test[cols_orig])

1. Distribution regression

Kernel mean embedding

Instead of fitting a model to the instances, the idea of distribution regression is to find a regression on the underlying probability distributions the instances come from. It is based on the assumption that the data is $\{(x_i, y_i)\}_{i=1}^{n}$ with:

  • $n$ the number of bags in the dataset ;
  • $x_i$ the probability distribution of bag $i$ ;
  • $y_i$ is the aerosol optical depth of bag $i$.

However, $x_i$ is not observed: for each bag $i$, the $100$ instances $x_{i,l}$, $l=1,...,100$, are samples from the distribution $x_i$. Our dataset is thus $\{(\{x_{i,l}\}_{l=1}^{100}, y_i)\}_{i=1}^{n}$ and we want to find a mapping $\hat{f}$ that will best predict unseen bags.

The mapping $\hat{f}$ on $\{(\{x_{i,l}\}_{l=1}^{100}, y_i)\}_{i=1}^{n}$ will try to learn the relationship between the true distributions $\{x_i\}_{i=1}^{n}$ and the target values $\{y_i\}_{i=1}^{n}$. To achieve that, the information of the 100 instances in each bag has to be summarised whilst losing the less information possible. The aggregated approach that simply computes the mean of the features for each bag is an example of information summary, yet plenty of data is lost that way. A better way to represent each bag is via kernel mean embedding: $$\mu_{\hat{x}_i} = \frac{1}{100}\sum_{l=1}^{100} k(\cdot, x_{i,l})$$

Each bag is represented as a linear combination of kernels, and with the right choice of kernel, the lost information can be very negligible.

Kernel Ridge Regression

We now want to find $\hat{f}$ that minimises the following regularised least square problem: $$ \underset{f}{arg min} \sum_{i=1}^{n} (f(\mu_{\hat{x}_i}) - y_i)^2 + \lambda \Vert f \Vert^2$$

with $\lambda>0$ the L2 regularisation parameter.

In kernel ridge regression, $f$ is interpreted as a linear combination of feature space mappings $\phi$ of the data points $\mu_{\hat{x}_i}$: $$ f = \sum_{i=1}^{n} \alpha_i \phi(\mu_{\hat{x}_i} ) $$

The equation thus becomes: $$ \underset{\alpha}{arg min} (\Vert y -K\alpha \Vert^2 + \lambda \alpha^T K \alpha)$$ with :

  • $K(i,j) = k'(\mu_{\hat{x}_i} , \mu_{\hat{x}_j})$ for $i,j=1..n$ ;
  • $k'$ another kernel.

By differentiating with respect to $\alpha$ and setting it to zero: $$ \alpha^{*} = (K + \lambda I_n)^{-1}y $$

For the sake of simplicity and because the results proved to be reasonably good, we set $k'$ as the linear kernel and as a result: $$ K(i,j) = \frac{1}{100^2} \sum_{l,k=1}^{100} k(x_{i,l} , x_{j,k})$$


In [6]:
class Kernel(object):
    """ Kernel class from Zoltan Szabo
        giving the kernel mean embedding.
    """

    def __init__(self, par=None):
        """ Initialization.

        Parameters
        ----------
        par : dictionary, optional
              Name of the kernel and its parameters (default is
              {"name": "RBF", "sigma": 1}). The name of the kernel comes
              from "RBF", "exponential", "Cauchy", "student", "Matern3p2",
              "Matern5p2", "polynomial", "ratquadr" (rational quadratic),
              "invmquadr" (inverse multiquadr).
        """
        if par is None:
            par = {"name": "RBF", "sigma": 1}

        name = par["name"]
        self.name = name

        # other attributes:
        if name == "RBF" or name == "exponential" or name == "Cauchy":
            self.sigma = par["sigma"]
        elif name == "student":
            self.d = par["d"]
        elif name == "Matern3p2" or name == "Matern5p2":
            self.l = par["l"]
        elif name == "polynomial":
            self.c = par["c"]
            self.exponent = par["exponent"]
        elif name == "ratquadr" or name == "invmquadr":
            self.c = par["c"]
        else:
            raise Exception("kernel=?")

    def gram_matrix(self, y1, y2):
        """  Compute the Gram matrix = [k(y1[i,:], y2[j,:])]; i, j: running.

        Parameters
        ----------
        y1 : (number of samples1, dimension)-ndarray
             One row of y1 corresponds to one sample.
        y2 : (number of samples2, dimension)-ndarray
             One row of y2 corresponds to one sample.

        Returns
        -------
        g : ndarray.
            Gram matrix of y1 and y2.
        """

        if self.name == "RBF":
            sigma = self.sigma
            g = cdist(y1, y2)
            g = exp(-g ** 2 / (2 * sigma ** 2))
        elif self.name == "exponential":
            sigma = self.sigma
            g = cdist(y1, y2)
            g = exp(-g / (2 * sigma ** 2))
        elif self.name == "Cauchy":
            sigma = self.sigma
            g = cdist(y1, y2)
            g = 1 / (1 + g ** 2 / sigma ** 2)
        elif self.name == "student":
            d = self.d
            g = cdist(y1, y2)
            g = 1 / (1 + g ** d)
        elif self.name == "Matern3p2":
            l = self.l
            g = cdist(y1, y2) 
            g = (1 + sqrt(3) * g / l) * exp(-sqrt(3) * g / l)
        elif self.name == "Matern5p2":
            l = self.l
            g = cdist(y1, y2)
            g = (1 + sqrt(5) * g / l + 5 * g ** 2 / (3 * l ** 2)) * \
                exp(-sqrt(5) * g / l)
        elif self.name == "polynomial":
            c = self.c
            exponent = self.exponent
            g = (dot(y1, y2.T) + c) ** exponent
        elif self.name == "ratquadr":
            c = self.c
            g = cdist(y1, y2) ** 2
            g = 1 - g / (g + c)
        elif self.name == "invmquadr":
            c = self.c
            g = cdist(y1, y2)
            g = 1 / sqrt(g ** 2 + c ** 2)
        else:
            raise Exception("kernel=?")

        return g

In [7]:
# Compute the linear kernel product of 
# the mean embedding of X1 and X2
# denoted as K(i, j) above
def mean_embedding(X1, X2, kernel):
    k = Kernel(kernel)
    gram_mat = k.gram_matrix(X1, X2)
    # Number of instances in the bag
    N = float(gram_mat.shape[0])
    mu_X1_X2 = gram_mat.ravel().sum() / N**2
    return (mu_X1_X2)

# Return a symmetrised matrix
def symmetrise(A):
    return(A + A.T - np.diag(A.diagonal()))

# Compute the Gram matrix K given the kernel and 
# the smoothing parameter theta
def compute_gram(df, kernel, theta):
    nb_bag = df["id"].nunique()
    K_matrix = np.zeros((nb_bag, nb_bag))
    print("Computing {0} Gram matrix for theta={1}:".format(kernel, theta))
    for i in range(nb_bag):
        if (i%50 == 0):
            print("Bag number: {0}". format(i))
        
        for j in range(i+1):
            # Compute mean embedding
            X1 = df.loc[train["id"] == (i+1), cols_orig].values
            X2 = df.loc[train["id"] == (j+1), cols_orig].values

            K_matrix[i, j] = mean_embedding(X1, X2, {'name': kernel, 'sigma': theta})
            
    return symmetrise(K_matrix)
        
#K_cauchy = compute_gram(train, "Cauchy", 2**4)

In [8]:
# Class for kernel ridge regression
class RidgeRegression(object):
    def __init__(self, l2_reg):
        self.l2_reg = l2_reg

    def fit(self, G, y):
        # Train size
        n_train = G.shape[0]
        ridge_mat = G + (self.l2_reg * n_train) * np.identity(n_train)
        self.ridge_mat = ridge_mat
        # Shape of y_train is (1, n_train)
        self.y_train = y

    def predict(self, G_test):
        y_test_hat = self.y_train.dot(np.linalg.solve(self.ridge_mat, G_test))
        return y_test_hat

Kernel selection

A kernel is characterised by a parameter we will call $\theta$ and the ridge regression depends on the L2 regularisation $\lambda$. Through cross-validation, we selected the kernels giving the most stable validation loss. They are given below with their associated parameters:

  • Cauchy: $$k_C(a,b) = \dfrac{1}{1 + \dfrac{\Vert a-b\Vert_2^2}{\theta^2}}, \quad\theta_C = 16, \quad\lambda_C = 2^{-23} $$
  • Matérn 5/2: $$k_M(a,b) = \left(1 + \dfrac{\sqrt{5}\Vert a-b\Vert_2^2}{\theta} + \dfrac{5\Vert a-b\Vert_2^2}{3\theta^2} \right)e^{-\dfrac{\sqrt{5}\Vert a-b\Vert_2^2}{\theta}}, \quad\theta_M = 64, \quad\lambda_M = 2^{-31} $$
  • Rational quadratic: $$k_r(a,b) = 1 - \dfrac{\Vert a-b\Vert_2^2}{\Vert a-b\Vert_2^2 + \theta}, \quad\theta_r = 512, \quad\lambda_r = 2^{-26}$$

2. Stacking

We will then map the features in the three spaces that describes the data in different ways. Each kernel ridge regression gives a prediction of the labels and combining them might give a better result for three reasons:

  • Statistical reason: we might not have enough data and even if each model $h_i$ performs well on the training set, the true model $f$ might still be not reached ;
  • Computational reason: each model $h_i$ only finds a local optima ;
  • Representational reason: the true model is out of the representation of functions we're considering.

Combining our model might take us a step closer to finding the true model $f$. The ensembling technique we used was out-of-fold stacking.

Out-of-fold prediction

In the first stage, out-of-fold prediction is applied to ensure that each first-layer regressor does not overfit by predicting on data already seen. For each regressor, we iteratively separate the training data in $N$ folds ($N=5$ in our model), and then use N-1 folds to train the model and then predict the target value of the remaining fold. To create the new testing set, the average of the predictions of each fold is taken.


In [9]:
# G_train and G_test are pandas dataframes
# krr is a kernel ridge regression
def oof_prediction(krr, G_train, y_train, G_test, n_folds, random_seed):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    n_train = G_train.shape[0]
    n_test = G_test.shape[1]
    oof_train = np.zeros(n_train)
    oof_test = np.zeros(n_test)
    oof_test_folds = np.zeros((n_test, n_folds))

    for i, (train_index, test_index) in enumerate(kf.split(G_train)):
        G_tr = G_train.loc[train_index, train_index].values
        y_tr = y_train[train_index].reshape((1, -1))
        G_te = G_train.loc[train_index, test_index].values

        krr.fit(G_tr, y_tr)
        oof_train[test_index] = krr.predict(G_te)
        G_test_partial = G_test.loc[train_index, :]
        oof_test_folds[:, i] = krr.predict(G_test_partial.values)

    oof_test = oof_test_folds.mean(axis=1)
    return oof_train, oof_test

In [10]:
nb_bags_train = 980
# Create a vector with the unique values of y for each ID.
y_train = train.groupby("id")["y"].median().values

In [11]:
# Load Gram matrices
def load_gram(csv_file, nb_bags_train):
    # Import data
    G = pd.read_csv(csv_file, header=None)
    idx_train = nb_bags_train - 1
    idx_test = nb_bags_train
    G_train = G.loc[:idx_train, :idx_train]
    G_test = G.loc[:idx_train, idx_test:]
    return (G_train, G_test)

# Define models and import Gram matrices
# Cauchy
l2_reg_cauchy = 2**(-23)
cauchy = RidgeRegression(l2_reg_cauchy)
G_train_cauchy, G_test_cauchy = load_gram("kernels_me/Cauchy_16.csv", nb_bags_train)

# Matern 5/2
l2_reg_matern_52 = 2**(-31)
matern_52 = RidgeRegression(l2_reg_matern_52)
G_train_matern_52, G_test_matern_52 = load_gram("kernels_me/Matern_52_64.csv", nb_bags_train)

# Rational quadratic
l2_reg_rquadr = 2**(-26)
rquadr = RidgeRegression(l2_reg_rquadr)
G_train_rquadr, G_test_rquadr = load_gram("kernels_me/rquadr_512.csv", nb_bags_train)

In [12]:
# Create OOF train and test predictions
# Cauchy
cauchy_oof_train, cauchy_oof_test = oof_prediction(cauchy, G_train_cauchy, 
                                                   y_train, G_test_cauchy,
                                                   n_folds, random_seed)
# Matern 5/2
matern_52_oof_train, matern_52_oof_test = oof_prediction(matern_52, G_train_matern_52, 
                                                         y_train, G_test_matern_52,
                                                         n_folds, random_seed)
# Rational quadratic
rquadr_oof_train, rquadr_oof_test = oof_prediction(rquadr, G_train_rquadr, 
                                                   y_train, G_test_rquadr,
                                                   n_folds, random_seed)
print("Training is finished.")


Training is finished.

Second stage prediction with SVR

In the second stage, a Support Vector Regression is fed with the different predictions from the kernel ridge regression to predict the target value $y$. The SVR uses these predictions to compute the optimal weights assigned to each kernel regression and we might hope to find a better optimum to approximate the true regression function $f$.


In [13]:
# Building the new data frames using the
# of out-of-fold predictions
kernel_train = pd.DataFrame({'cauchy': cauchy_oof_train,
                             'matern_52': matern_52_oof_train,
                             'rquadr': rquadr_oof_train})

kernel_train["y"] = y_train

kernel_test = pd.DataFrame({'cauchy': cauchy_oof_test,
                            'matern_52': matern_52_oof_test,
                            'rquadr': rquadr_oof_test})

cols_excl_kernel = ["y"]
cols_kernel = [c for c in kernel_train.columns if c not in cols_excl_kernel]

kernel_train.head()


Out[13]:
cauchy matern_52 rquadr y
0 -3.553306 -3.497803 -3.584285 -3.998082
1 -4.238140 -4.246409 -4.246125 -4.137141
2 -2.585482 -2.645614 -2.563139 -2.694732
3 -3.893719 -3.965724 -3.902051 -3.296275
4 -3.684191 -3.718706 -3.654680 -3.181391

Tuning SVR's parameters


In [18]:
# Root mean squared error metric
def RMSE(y, y_hat):
    out = np.sqrt(mean_squared_error(y.reshape((-1,)), y_hat.reshape((-1,))))
    return (out)

In [19]:
def scoring_function(parameters):
    print("Training the model with parameters: ")
    print(parameters)
    
    # Run several KFold shuffles and take the mean RMSE
    average_RMSE = []
    nb_run = 10
    
    for m in range(nb_run):
        KFold_RMSE = 0.0
        n_splits = 5

        kf = KFold(n_splits=n_splits, shuffle=True, random_state=(random_seed+m))
        nb_fold = 0
        for train_index, validation_index in kf.split(kernel_train):
            nb_fold += 1
            train_fold, validation_fold = kernel_train.loc[train_index], kernel_train.loc[validation_index]

            svr = SVR(C=parameters["C"], epsilon=parameters["epsilon"])
            svr.fit(train_fold[cols_kernel], train_fold["y"])

            y_hat_test = svr.predict(validation_fold[cols_kernel])
            RMSE_test = RMSE(y_hat_test, validation_fold["y"].values)

            KFold_RMSE += RMSE_test

        KFold_RMSE /= n_splits
        
        average_RMSE.append(KFold_RMSE)
    
    average_RMSE = np.array(average_RMSE)

    print("Cross-validation score: {0} +/- {1}\n".format(average_RMSE.mean(),
                                                         2*average_RMSE.std()))
    
    return {"loss": average_RMSE.mean(), "status": STATUS_OK}

In [50]:
# Grid to pick parameters from.
parameters_grid = {"C":       hp.choice("C", np.arange(0.5, 3, 0.5)),
                   "epsilon": hp.choice("epsilon", np.arange(0.05, 0.25, 0.05))
                  }
# Record the information about the cross-validation.
trials = Trials()

best = fmin(scoring_function, parameters_grid, algo=tpe.suggest, max_evals=10, 
            trials=trials)


Training the model with parameters: 
{'epsilon': 0.050000000000000003, 'C': 1.5}
Cross-validation score: 0.686196784675 +/- 0.00310294556159

Training the model with parameters: 
{'epsilon': 0.10000000000000001, 'C': 0.5}
Cross-validation score: 0.688370690304 +/- 0.00292268709048

Training the model with parameters: 
{'epsilon': 0.050000000000000003, 'C': 2.0}
Cross-validation score: 0.685749746794 +/- 0.00324723589177

Training the model with parameters: 
{'epsilon': 0.20000000000000001, 'C': 1.5}
Cross-validation score: 0.684769970815 +/- 0.00224621885902

Training the model with parameters: 
{'epsilon': 0.15000000000000002, 'C': 2.5}
Cross-validation score: 0.685005070367 +/- 0.00345812467647

Training the model with parameters: 
{'epsilon': 0.10000000000000001, 'C': 1.0}
Cross-validation score: 0.686645044299 +/- 0.00267387859763

Training the model with parameters: 
{'epsilon': 0.15000000000000002, 'C': 1.0}
Cross-validation score: 0.685922402595 +/- 0.0022856147877

Training the model with parameters: 
{'epsilon': 0.15000000000000002, 'C': 0.5}
Cross-validation score: 0.687533030786 +/- 0.00241636467058

Training the model with parameters: 
{'epsilon': 0.10000000000000001, 'C': 1.5}
Cross-validation score: 0.686081435086 +/- 0.00327535796236

Training the model with parameters: 
{'epsilon': 0.050000000000000003, 'C': 2.0}
Cross-validation score: 0.685749746794 +/- 0.00324723589177


In [51]:
min(trials.losses())


Out[51]:
0.684769970815494

In [52]:
# Save the best parameters as a csv.
best_parameters = pd.DataFrame({key: [value] for (key, value) in 
                                zip(space_eval(parameters_grid, best).keys(),
                                    space_eval(parameters_grid, best).values())})
# Add the corresponding score.
best_parameters["score"] = min(trials.losses())
best_parameters.to_csv("Parameters/best_parameters_SVR.csv", encoding="utf-8", index=False)

best_parameters


Out[52]:
C epsilon score
0 1.5 0.2 0.68477

Prediction


In [57]:
best_parameters = pd.read_csv("Parameters/best_parameters_SVR.csv", encoding="utf-8")
best_parameters


Out[57]:
C epsilon score
0 1.5 0.2 0.68477

In [58]:
svr = SVR(C=best_parameters["C"][0],
          epsilon=best_parameters["epsilon"][0])
svr.fit(kernel_train[cols_kernel], y_train)


Out[58]:
SVR(C=1.5, cache_size=200, coef0=0.0, degree=3, epsilon=0.20000000000000001,
  gamma='auto', kernel='rbf', max_iter=-1, shrinking=True, tol=0.001,
  verbose=False)

In [59]:
# Training error
RMSE(svr.predict(kernel_train[cols_kernel]), y_train)


Out[59]:
0.67393208259018023

In [60]:
# Prediction
y_hat_test = svr.predict(kernel_test[cols_kernel])

test_pred = test.groupby("id")[["y"]].mean().reset_index()
test_pred["y"] = y_hat_test
test_pred.columns = ["Id", "y"]

# Save as a .csv
test_pred.to_csv("Predictions/Prediction_SVR.csv", index=False)

In [ ]: